# PACKAGES REQUIRED
# Standard library
import random

# Third-party
import pandas as pd
import numpy as np
from tableauscraper import TableauScraper as TS
from dash import Dash, html, Input, Output, dash_table
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import sklearn as sk
import sklearn.linear_model
import seaborn as sns
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd  # used by tukey_grab(); was never imported
# The repository link is below. SeolHaeJuan = Jared, for clarification.
# https://github.com/SeolHaeJuan/STA-141B-Project.git
# Only 2021 for data
# Scrape the UC admissions Tableau dashboard (transfer ethnicity-by-year view).
url = "https://visualizedata.ucop.edu/t/Public/views/AdmissionsDataTable/TREthbyYr?:embed_code_version=3&:embed=y&:loadOrderID=0&:display_spinner=no&:showAppBanner=false&:display_count=n&:showVizHome=n&:origin=viz_share_link"
ts = TS()
ts.loads(url)
workbook = ts.getWorkbook()
# NOTE: the loop variable `t` is deliberately left bound after the loop --
# the next cell reads `t.data` (the last worksheet's DataFrame).
for t in workbook.worksheets:
    print(f"worksheet name : {t.name}") #show worksheet name
    print(t.data) #show dataframe for this worksheet
worksheet name : TR by Year
School-value School-alias Calculation1-value \
0 ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE4002
1 ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE4002
2 ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE4002
3 ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE4002
4 ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE4002
... ... ... ...
2767 YUBA COLLEGE YUBA COLLEGE YUBA COLLEGE4994
2768 YUBA COLLEGE YUBA COLLEGE YUBA COLLEGE4994
2769 YUBA COLLEGE YUBA COLLEGE YUBA COLLEGE4994
2770 YUBA COLLEGE YUBA COLLEGE YUBA COLLEGE4994
2771 YUBA COLLEGE YUBA COLLEGE YUBA COLLEGE4994
Calculation1-alias County-value County-alias City-value \
0 ALLAN HANCOCK COLLEGE4002 Santa Barbara Santa Barbara Santa Maria
1 ALLAN HANCOCK COLLEGE4002 Santa Barbara Santa Barbara Santa Maria
2 ALLAN HANCOCK COLLEGE4002 Santa Barbara Santa Barbara Santa Maria
3 ALLAN HANCOCK COLLEGE4002 Santa Barbara Santa Barbara Santa Maria
4 ALLAN HANCOCK COLLEGE4002 Santa Barbara Santa Barbara Santa Maria
... ... ... ... ...
2767 YUBA COLLEGE4994 Yuba Yuba Marysville
2768 YUBA COLLEGE4994 Yuba Yuba Marysville
2769 YUBA COLLEGE4994 Yuba Yuba Marysville
2770 YUBA COLLEGE4994 Yuba Yuba Marysville
2771 YUBA COLLEGE4994 Yuba Yuba Marysville
City-alias Count-value Count-alias Uad Uc Ethn 6 Cat-value \
0 Santa Maria Enr Enr International
1 Santa Maria Adm Adm International
2 Santa Maria App App International
3 Santa Maria Enr Enr Unknown
4 Santa Maria Enr Enr Unknown
... ... ... ... ...
2767 Marysville Adm Adm African American
2768 Marysville App App African American
2769 Marysville Enr Enr All
2770 Marysville Adm Adm All
2771 Marysville App App All
Uad Uc Ethn 6 Cat-alias SUM(Pivot Field Values)-alias
0 Inter- national %null%
1 Inter- national 3
2 Inter- national 5
3 Domestic unknown %null%
4 Domestic unknown %null%
... ... ...
2767 African American 3
2768 African American 4
2769 All 40
2770 All 52
2771 All 64
[2772 rows x 13 columns]
# `t` is the last worksheet from the scraping loop above ("TR by Year").
# NOTE: mainframe aliases t.data, so the inplace drop/rename below mutate t.data too.
mainframe = t.data
# Drop the duplicate "*-alias" / internal columns by position, keep one copy of each field.
mainframe.drop(mainframe.columns[[1, 2, 3, 4, 7, 8, 11]], axis=1, inplace=True)
mainframe.rename(columns = {'School-value': 'School', 'County-alias': 'County', 'City-value': 'City', 'Count-alias': 'Type', 'Uad Uc Ethn 6 Cat-value': 'Eth','SUM(Pivot Field Values)-alias':'Value'}, inplace = True)
mainframe
| School | County | City | Type | Eth | Value | |
|---|---|---|---|---|---|---|
| 0 | ALLAN HANCOCK COLLEGE | Santa Barbara | Santa Maria | Enr | International | %null% |
| 1 | ALLAN HANCOCK COLLEGE | Santa Barbara | Santa Maria | Adm | International | 3 |
| 2 | ALLAN HANCOCK COLLEGE | Santa Barbara | Santa Maria | App | International | 5 |
| 3 | ALLAN HANCOCK COLLEGE | Santa Barbara | Santa Maria | Enr | Unknown | %null% |
| 4 | ALLAN HANCOCK COLLEGE | Santa Barbara | Santa Maria | Enr | Unknown | %null% |
| ... | ... | ... | ... | ... | ... | ... |
| 2767 | YUBA COLLEGE | Yuba | Marysville | Adm | African American | 3 |
| 2768 | YUBA COLLEGE | Yuba | Marysville | App | African American | 4 |
| 2769 | YUBA COLLEGE | Yuba | Marysville | Enr | All | 40 |
| 2770 | YUBA COLLEGE | Yuba | Marysville | Adm | All | 52 |
| 2771 | YUBA COLLEGE | Yuba | Marysville | App | All | 64 |
2772 rows × 6 columns
# Tableau encodes missing counts as the string "%null%"; treat them as 0.
mainframe.loc[mainframe["Value"] == "%null%", "Value"] = 0
# Dataframe subsetting: one frame per count type (enrolled / admitted / applied),
# excluding the "All" rollup rows, plus a totals frame of only the rollups.
enr_frame = mainframe.loc[(mainframe['Type'] == 'Enr') & (mainframe['Eth'] != 'All')]
adm_frame = mainframe.loc[(mainframe['Type'] == 'Adm') & (mainframe['Eth'] != 'All')]
app_frame = mainframe.loc[(mainframe['Type'] == 'App') & (mainframe['Eth'] != 'All')]
tot_frame = mainframe[mainframe['Eth'] == 'All']
# Reset to a clean 0..n-1 index so adm/app rows align positionally when divided
# below.  drop=True discards the old index instead of keeping it as a column
# (replaces the original reset_index() + drop-first-column two-step).
enr_frame = enr_frame.reset_index(drop=True)
adm_frame = adm_frame.reset_index(drop=True)
app_frame = app_frame.reset_index(drop=True)
# Switch Value to numeric for division later.
app_frame['Value'] = pd.to_numeric(app_frame['Value'])
adm_frame['Value'] = pd.to_numeric(adm_frame['Value']) # Focusing on this frame
# BUG FIX: the original converted adm_frame's values and stored them into
# enr_frame, clobbering the enrollment counts.
enr_frame['Value'] = pd.to_numeric(enr_frame['Value'])
tot_frame[tot_frame.Type == 'Adm'].sort_values(by=['Value'], ascending = False)
# Rate column.  With this option, inf (admits / 0 applicants) reads as NaN,
# so the fillna(0) below zeroes it out.
pd.set_option('mode.use_inf_as_na', True)
# Admit rate = admitted / applied, relying on the row-for-row alignment above.
adm_frame['Rate'] = adm_frame['Value']/app_frame['Value']
app_frame['Rate'] = adm_frame['Value']/app_frame['Value']
adm_frame = adm_frame.fillna(0)
app_frame = app_frame.fillna(0)
# Getting rid of unknowns. Better to do it after we calculated rate to not mess with positioning
adm_frame = adm_frame[adm_frame['Eth']!= 'Unknown']
app_frame = app_frame[app_frame['Eth']!= 'Unknown']
enr_frame = enr_frame[enr_frame['Eth']!= 'Unknown']
adm_frame.loc[adm_frame['Rate'] == 1.0].head(5)
| School | County | City | Type | Eth | Value | Rate | Binary Rate | |
|---|---|---|---|---|---|---|---|---|
| 43 | BERKELEY CITY COLLEGE | Alameda | Berkeley | Adm | American Indian | 3 | 1.0 | 1 |
| 45 | BUTTE COLLEGE | Butte | Oroville | Adm | International | 6 | 1.0 | 1 |
| 48 | BUTTE COLLEGE | Butte | Oroville | Adm | Asian | 11 | 1.0 | 1 |
| 57 | CABRILLO COLLEGE | Santa Cruz | Aptos | Adm | American Indian | 4 | 1.0 | 1 |
| 129 | COLLEGE OF MARIN | Marin | Kentfield | Adm | International | 4 | 1.0 | 1 |
adm_frame.loc[adm_frame['Rate'] == 1.0]['Eth'].value_counts()
International 11 American Indian 5 African American 4 Asian 2 Chicano/Latino 2 White 1 Name: Eth, dtype: int64
# Schools with the highest admit rate per ethnicity.
sorty = adm_frame.sort_values(by='Rate', ascending=False)
# Restrict to schools with more than 10 admits, then show the top ten rows.
sorty.loc[sorty['Value'] > 10].head(10)
| School | County | City | Type | Eth | Value | Rate | |
|---|---|---|---|---|---|---|---|
| 48 | BUTTE COLLEGE | Butte | Oroville | Adm | Asian | 11 | 1.000000 |
| 728 | SIERRA COLLEGE | Placer | Rocklin | Adm | African American | 11 | 1.000000 |
| 133 | COLLEGE OF MARIN | Marin | Kentfield | Adm | Asian | 19 | 0.950000 |
| 477 | MIRACOSTA COLLEGE | San Diego | Oceanside | Adm | International | 33 | 0.942857 |
| 120 | COASTLINE COMMUNITY COLLEGE | Orange | Fountain Valley | Adm | Chicano/Latino | 16 | 0.941176 |
| 507 | MOORPARK COLLEGE | Ventura | Moorpark | Adm | International | 15 | 0.937500 |
| 675 | SANTA ANA COLLEGE | Orange | Santa Ana | Adm | International | 13 | 0.928571 |
| 785 | WEST LOS ANGELES COLLEGE | Los Angeles | Culver City | Adm | International | 38 | 0.926829 |
| 672 | SAN JOSE CITY COLLEGE | Santa Clara | San Jose | Adm | Asian | 24 | 0.923077 |
| 128 | COLLEGE OF ALAMEDA | Alameda | Alameda | Adm | African American | 12 | 0.923077 |
# GPA
# Second dashboard view: transfer GPA by year.
url = "https://visualizedata.ucop.edu/t/Public/views/AdmissionsDataTable/TRGPAbyYr?:embed_code_version=3&:embed=y&:loadOrderID=0&:display_spinner=no&:showAppBanner=false&:display_count=n&:showVizHome=n&:origin=viz_share_link"
ts = TS()
ts.loads(url)
workbook = ts.getWorkbook()
for t in workbook.worksheets:
    print(f"worksheet name : {t.name}") #show worksheet name
    print(t.data) #show dataframe for this worksheet
# Keep the last worksheet's data; drop duplicate alias columns and rename the rest.
gpa = t.data
gpa.drop(gpa.columns[[1, 2, 4, 5, 6, 8]], axis=1, inplace=True)
gpa.rename(columns = {'School-value': 'School', 'City-alias': 'City', 'County-alias': 'County', 'Measure Names-alias': 'Type', 'Measure Values-alias':'GPA'}, inplace = True)
worksheet name : TR GPA by Year
School-value School-alias City-value \
0 ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE Santa Maria
1 ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE Santa Maria
2 ALLAN HANCOCK COLLEGE ALLAN HANCOCK COLLEGE Santa Maria
3 AMERICAN RIVER COLLEGE AMERICAN RIVER COLLEGE Sacramento
4 AMERICAN RIVER COLLEGE AMERICAN RIVER COLLEGE Sacramento
.. ... ... ...
331 WOODLAND COMMUNITY COLLEGE WOODLAND COMMUNITY COLLEGE Woodland
332 WOODLAND COMMUNITY COLLEGE WOODLAND COMMUNITY COLLEGE Woodland
333 YUBA COLLEGE YUBA COLLEGE Marysville
334 YUBA COLLEGE YUBA COLLEGE Marysville
335 YUBA COLLEGE YUBA COLLEGE Marysville
City-alias Calculation1-value \
0 Santa Maria ALLAN HANCOCK COLLEGE004002
1 Santa Maria ALLAN HANCOCK COLLEGE004002
2 Santa Maria ALLAN HANCOCK COLLEGE004002
3 Sacramento AMERICAN RIVER COLLEGE004004
4 Sacramento AMERICAN RIVER COLLEGE004004
.. ... ...
331 Woodland WOODLAND COMMUNITY COLLEGE005762
332 Woodland WOODLAND COMMUNITY COLLEGE005762
333 Marysville YUBA COLLEGE004994
334 Marysville YUBA COLLEGE004994
335 Marysville YUBA COLLEGE004994
Calculation1-alias County-value County-alias \
0 ALLAN HANCOCK COLLEGE004002 Santa Barbara Santa Barbara
1 ALLAN HANCOCK COLLEGE004002 Santa Barbara Santa Barbara
2 ALLAN HANCOCK COLLEGE004002 Santa Barbara Santa Barbara
3 AMERICAN RIVER COLLEGE004004 Sacramento Sacramento
4 AMERICAN RIVER COLLEGE004004 Sacramento Sacramento
.. ... ... ...
331 WOODLAND COMMUNITY COLLEGE005762 Yolo Yolo
332 WOODLAND COMMUNITY COLLEGE005762 Yolo Yolo
333 YUBA COLLEGE004994 Yuba Yuba
334 YUBA COLLEGE004994 Yuba Yuba
335 YUBA COLLEGE004994 Yuba Yuba
Measure Names-value Measure Names-alias \
0 [federated.11exkwi1b9bzff10j00kr0sac218].[sum:... Enrl GPA
1 [federated.11exkwi1b9bzff10j00kr0sac218].[sum:... Adm GPA
2 [federated.11exkwi1b9bzff10j00kr0sac218].[sum:... App GPA
3 [federated.11exkwi1b9bzff10j00kr0sac218].[sum:... Enrl GPA
4 [federated.11exkwi1b9bzff10j00kr0sac218].[sum:... Adm GPA
.. ... ...
331 [federated.11exkwi1b9bzff10j00kr0sac218].[sum:... Adm GPA
332 [federated.11exkwi1b9bzff10j00kr0sac218].[sum:... App GPA
333 [federated.11exkwi1b9bzff10j00kr0sac218].[sum:... Enrl GPA
334 [federated.11exkwi1b9bzff10j00kr0sac218].[sum:... Adm GPA
335 [federated.11exkwi1b9bzff10j00kr0sac218].[sum:... App GPA
Measure Values-alias
0 3.45
1 3.43
2 3.34
3 3.56
4 3.53
.. ...
331 3.59
332 3.41
333 3.58
334 3.53
335 3.40
[336 rows x 11 columns]
# Subsetting GPA so we only get the one's we want
enr_gpa = gpa[gpa.Type == 'Enrl GPA']
adm_gpa = gpa[gpa.Type == 'Adm GPA']
app_gpa = gpa[gpa.Type == 'App GPA']
adm_gpa = adm_gpa[['School', 'County', 'City', 'Type', 'GPA']]
# Strips 'GPA' so 'Adm GPA' becomes 'Adm ' (note: keeps a trailing space).
adm_gpa['Type'] = adm_gpa['Type'].str.replace('GPA', '')
# NOTE(review): `adm_max` is not defined anywhere in this file -- presumably a
# per-school max-admit frame built in another (out-of-order) notebook cell.
# Confirm it exists before running this cell.
df_all = adm_gpa.merge(adm_max.drop_duplicates(), on=['School', 'County', 'City'],
how='left', indicator=True)
# Drop the leftover Type_x column and the merge indicator column by position.
df_all.drop(df_all.columns[[3, 9]], axis=1, inplace=True)
df_all.rename(columns = {'Type_y': 'Type'}, inplace = True)
final_frame = df_all
final_frame.Eth.value_counts()
# This is the count of what ethnicity each college's max amount of admits is equal to
Chicano/Latino 51 White 30 Asian 26 International 4 Name: Eth, dtype: int64
# Ethnicity counts among merged rows with a perfect (1.0) admit rate.
final_frame.loc[final_frame['Rate'] == 1.0]['Eth'].value_counts() # only one school with 100% admit rate
Chicano/Latino 1 Name: Eth, dtype: int64
# Show the single school whose max-admit ethnicity has a 100% admit rate.
final_frame.loc[final_frame['Rate'] == 1.0] # Not many people transferred from here
| School | County | City | GPA | Type | Eth | Value | Rate | |
|---|---|---|---|---|---|---|---|---|
| 106 | WEST HILLS COLLEGE COALINGA | Fresno | Coalinga | 3.57 | Adm | Chicano/Latino | 4.0 | 1.0 |
# For each school keep the single row with the largest admitted Value
# (sort descending, keep first occurrence per School, restore original row order).
# NOTE(review): this rebinds final_frame, replacing the GPA-merged frame built
# above -- the cells in this dump rely on notebook execution order; verify which
# version of final_frame each later cell expects.
final_frame = adm_frame.sort_values('Value', ascending=False).drop_duplicates('School').sort_index()
# final_frame is the high for each school, where eth is the max amount of students admitted
# Sorting by descending values. These schools had the most students admitted.
final_frame.sort_values('Value', ascending = False)
| School | County | City | Type | Eth | Value | Rate | Binary Rate | |
|---|---|---|---|---|---|---|---|---|
| 589 | PASADENA CITY COLLEGE | Los Angeles | Pasadena | Adm | Asian | 588 | 0.790323 | 1 |
| 242 | DE ANZA COLLEGE | Santa Clara | Cupertino | Adm | Asian | 566 | 0.782849 | 1 |
| 693 | SANTA MONICA COLLEGE | Los Angeles | Santa Monica | Adm | White | 492 | 0.772370 | 1 |
| 685 | SANTA BARBARA CITY COLLEGE | Santa Barbara | Santa Barbara | Adm | White | 473 | 0.828371 | 1 |
| 616 | RIVERSIDE CITY COLLEGE | Riverside | Riverside | Adm | Chicano/Latino | 352 | 0.733333 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 199 | COPPER MOUNTAIN COLLEGE | San Bernardino | Joshua Tree | Adm | White | 7 | 0.875000 | 1 |
| 278 | FEATHER RIVER COLLEGE | Plumas | Quincy | Adm | White | 4 | 0.666667 | 1 |
| 779 | WEST HILLS COLLEGE COALINGA | Fresno | Coalinga | Adm | Chicano/Latino | 4 | 1.000000 | 1 |
| 177 | COLLEGE OF THE SISKIYOUS | Siskiyou | Weed | Adm | Chicano/Latino | 3 | 0.600000 | 1 |
| 385 | LASSEN COLLEGE | Lassen | Susanville | Adm | White | 0 | 0.000000 | 0 |
113 rows × 8 columns
# Rows where some ethnicity's admit rate is exactly 100%.
one_outs = adm_frame.loc[adm_frame['Rate'] == 1.0]
# The 11 schools with 100% international student admittance
inters = one_outs.loc[one_outs['Eth'] == 'International', 'School']
# Restrict final_frame (each school's max-admit row) to those schools.
result1 = final_frame.loc[final_frame['School'].isin(list(inters))]
result1.head(20)
# From this result, we see that all the schools with a 100% international student admit rate, do not have international students as their ethnicity with max admittance.
| School | County | City | Type | Eth | Value | Rate | Binary Rate | |
|---|---|---|---|---|---|---|---|---|
| 47 | BUTTE COLLEGE | Butte | Oroville | Adm | White | 29 | 0.763158 | 1 |
| 132 | COLLEGE OF MARIN | Marin | Kentfield | Adm | White | 69 | 0.683168 | 1 |
| 208 | COSUMNES RIVER COLLEGE | Sacramento | Sacramento | Adm | Asian | 89 | 0.729508 | 1 |
| 283 | FOLSOM LAKE COLLEGE | Sacramento | Folsom | Adm | White | 84 | 0.770642 | 1 |
| 407 | LOS ANGELES HARBOR COLLEGE | Los Angeles | Wilmington | Adm | Chicano/Latino | 54 | 0.701299 | 1 |
| 467 | MERCED COLLEGE | Merced | Merced | Adm | Chicano/Latino | 90 | 0.671642 | 1 |
| 474 | MERRITT COLLEGE | Alameda | Oakland | Adm | Chicano/Latino | 21 | 0.777778 | 1 |
| 488 | MISSION COLLEGE | Santa Clara | Santa Clara | Adm | Asian | 62 | 0.738095 | 1 |
| 551 | NORCO COLLEGE | Riverside | Norco | Adm | Chicano/Latino | 92 | 0.686567 | 1 |
| 574 | OXNARD COLLEGE | Ventura | Oxnard | Adm | Chicano/Latino | 55 | 0.833333 | 1 |
| 763 | VENTURA COLLEGE | Ventura | Ventura | Adm | Chicano/Latino | 112 | 0.722581 | 1 |
# Our averages across all rows of adm_frame.
mean_admits = adm_frame['Value'].mean()
mean_rate = adm_frame['Rate'].mean()
print("Mean Number of Admitted Students:", mean_admits)
print("Mean Admit Rate Across All CCs", mean_rate)
Mean Number of Admitted Students: 45.096930533117934 Mean Admit Rate Across All CCs 0.6021040155998523
# Mean of every numeric column in final_frame.
final_frame.mean(numeric_only = True)
# Ethnicity categories present in the data (minus 'Unknown' and the 'All' rollup).
e_list = ['African American', 'American Indian', 'Asian', 'Chicano/Latino', 'International', 'White']
def get_val(x):
    """Print the per-ethnicity mean of each numeric column of frame `x`.

    Iterates the module-level `e_list`, printing one Series per ethnicity.
    """
    for i in e_list:
        print(x[x['Eth'] == i].mean(numeric_only = True))
get_val(adm_frame)
get_val(final_frame) # First two are zeroes, as they were not in the final frame
Value 13.000000 Rate 0.558193 Binary Rate 0.754545 dtype: float64 Value 1.525000 Rate 0.251741 Binary Rate 0.312500 dtype: float64 Value 66.321429 Rate 0.664115 Binary Rate 0.875000 dtype: float64 Value 75.531532 Rate 0.732823 Binary Rate 0.990991 dtype: float64 Value 33.319149 Rate 0.596481 Binary Rate 0.723404 dtype: float64 Value 66.241071 Rate 0.708647 Binary Rate 0.946429 dtype: float64 Value NaN Rate NaN Binary Rate NaN dtype: float64 Value NaN Rate NaN Binary Rate NaN dtype: float64 Value 164.307692 Rate 0.774025 Binary Rate 1.000000 dtype: float64 Value 76.961538 Rate 0.717070 Binary Rate 1.000000 dtype: float64 Value 112.750000 Rate 0.875353 Binary Rate 1.000000 dtype: float64 Value 128.870968 Rate 0.735716 Binary Rate 0.967742 dtype: float64
# Does not appear to be normally distributed
# KDE of admit Rate per ethnicity; common_norm=False scales each group's density separately.
sns.kdeplot(
    data=adm_frame, x="Rate", hue="Eth",
    fill=True, common_norm=False, palette="crest",
    alpha=.5, linewidth=0,
)
<AxesSubplot:xlabel='Rate', ylabel='Density'>
# Adm Frame in pretty format: render adm_frame as an interactive Plotly table.
fig = go.Figure(data=[go.Table(
    header=dict(values=list(adm_frame.columns),
                fill_color='cornflowerblue',
                align='left'),
    # One list per displayed column, in the frame's column order.
    cells=dict(values=[adm_frame.School, adm_frame.County, adm_frame.City, adm_frame.Type, adm_frame.Eth, adm_frame.Value, adm_frame.Rate],
               fill_color='ivory',
               align='left'))
])
fig.update_layout(
    title_text = "Admit Frame",
    title_font_size=30,
    font_family="Times New Roman",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
)
fig.update_traces(cells_font=dict(size = 12))
fig.show()
# Plots
# Boxplot of admit Rate by ethnicity; points="all" overlays every observation.
fig = px.box(adm_frame, x="Rate", y="Eth", color="Eth", points = "all",
             labels={
                 "Rate": "Admittance Rate",
                 "Eth": "Ethnicity",
             },
             title="Boxplot of Admittance Rate Per Each Ethnicity")
fig.show()
# Changing for outliers, and the fact that some schools have zero from int and native american
# Same boxplot, restricted to Rate > 0.2 to drop the zero/near-zero rows.
nhk = adm_frame[adm_frame['Rate'] > 0.2]
fig = px.box(nhk, x="Rate", y="Eth", color="Eth", points = "all",
             labels={
                 "Rate": "Admittance Rate",
                 "Eth": "Ethnicity",
             },
             title="Boxplot of Admittance Rate Per Each Ethnicity")
fig.show()
# Same >0.2 filter; scatter of admit count vs rate with per-ethnicity OLS trendlines.
test_frame = adm_frame.loc[adm_frame['Rate']>0.2]
fig = px.scatter(test_frame,
                 x = 'Value',
                 y = 'Rate',
                 template = 'plotly_dark',
                 color = 'Eth',
                 trendline = 'ols',
                 title = 'Admit Rate Per Different Ethnicities')
fig.update_layout(showlegend=True)
fig.show()
sns.set(rc={'figure.figsize':(8,10)})
# BUG FIX: sns.pointplot() has no `title` keyword (passing one raises a
# TypeError); set the title on the returned Axes instead.
ax = sns.pointplot(data=adm_frame, x="Rate", y="Value", hue="Eth")
ax.set_title("Point Plot of Admitted Ethnicities")
# Final frame in pretty, interactive format
# NOTE(review): references final_frame.GPA, which only exists on the GPA-merged
# version of final_frame (not the adm_frame-derived rebuild) -- depends on
# notebook execution order.
fig = go.Figure(data=[go.Table(
    header=dict(values=list(final_frame.columns),
                fill_color='lavenderblush',
                align='left'),
    cells=dict(values=[final_frame.School, final_frame.County, final_frame.City, final_frame.GPA, final_frame.Type, final_frame.Eth, final_frame.Value, final_frame.Rate],
               fill_color='lavender',
               align='left'))
])
fig.update_layout(
    title_text = "Final Frame | Each CC's Most Admitted Ethnicity",
    title_font_size=30,
    font_family="Times New Roman",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
)
fig.update_traces(cells_font=dict(size = 10))
fig.show()
# Rate is not normally distributed across the entire dataset
from scipy import stats
# Shapiro-Wilk on all Rate values; the tiny p-value rejects normality.
stats.shapiro(adm_frame['Rate'])
ShapiroResult(statistic=0.7629678249359131, pvalue=6.721139208871741e-29)
# Test for normality of variables: each ethnicity's Rate distribution is non-normal.
from scipy.stats import shapiro
for i in e_list:
    print(i, shapiro(adm_frame[adm_frame.Eth == i].Rate),'\n')
African American ShapiroResult(statistic=0.8359304070472717, pvalue=1.0530741834102741e-09) American Indian ShapiroResult(statistic=0.638819694519043, pvalue=9.998421361678833e-13) Asian ShapiroResult(statistic=0.6992810368537903, pvalue=7.828422155312098e-14) Chicano/Latino ShapiroResult(statistic=0.7622840404510498, pvalue=4.056537224184087e-12) International ShapiroResult(statistic=0.7731039524078369, pvalue=9.79381784005362e-11) White ShapiroResult(statistic=0.6323310136795044, pvalue=2.4369747756228244e-15)
# TEST FOR DIFFERENCE OF MEANS
from scipy import stats
# Kruskal-Wallis (nonparametric, since Rate is not normal) across the six
# ethnicity groups' admit rates.
kruskal_groups = ['African American', 'American Indian', 'Asian',
                  'Chicano/Latino', 'International', 'White']
stats.kruskal(*(adm_frame[adm_frame.Eth == g].Rate for g in kruskal_groups))
# DIFFERENCE OF MEANS
KruskalResult(statistic=84.75056525805913, pvalue=8.491057372967863e-17)
## FOR NORMALITY TEST OF TRANSFORMED FRAME JUST TO SEE
# NOTE(review): minion_frame is defined further down in this dump
# (adm_frame filtered to Rate > 0.20); this cell can only run after that one
# in the notebook's actual execution order.
stats.kruskal(minion_frame[minion_frame.Eth == 'African American'].Rate,
              minion_frame[minion_frame.Eth == 'American Indian'].Rate,
              minion_frame[minion_frame.Eth == 'Asian'].Rate,
              minion_frame[minion_frame.Eth == 'Chicano/Latino'].Rate,
              minion_frame[minion_frame.Eth == 'International'].Rate,
              minion_frame[minion_frame.Eth == 'White'].Rate)
### Function to allow us to get the tukey table for what we want to ask.
def tukey_table(variable):
    """Render the module-level `tukey_sum` frame as a styled Plotly table.

    `variable` is used only in the figure title.  Expects the global
    `tukey_sum` to have been set by tukey_grab() before this is called.
    """
    gen_title = "Tukey Table for Comparisons of "
    fig = go.Figure(data=[go.Table(
        header=dict(values=list(tukey_sum.columns),
                    fill_color='lavenderblush',
                    align='left'),
        cells=dict(values=[tukey_sum.group1, tukey_sum.group2, tukey_sum.meandiff, tukey_sum['p-adj'], tukey_sum.lower, tukey_sum.upper, tukey_sum.reject],
                   fill_color='lavender',
                   align='left'))
    ])
    fig.update_layout(
        title_text = gen_title + variable,
        title_x=0.5,
        title_font_size=30,
        font_family="Times New Roman",
        font_color="black",
        title_font_family="Times New Roman",
        title_font_color="black",
    )
    fig.update_traces(cells_font=dict(size = 10))
    fig.show()
def tukey_grab(frame, variable):
    """Run a rank-based (nonparametric) Tukey HSD on `variable` across ethnicities.

    Builds a long-format frame of (variable, Eth) pairs for every ethnicity in
    the module-level `e_list`, runs pairwise_tukeyhsd on the *ranked* values,
    renders the result via tukey_table(), and prints the text summary.
    """
    comp = []   # all observations, concatenated group by group
    gru = []    # matching group label for each observation
    for eth in e_list:
        vals = list(frame.loc[frame.Eth == eth, variable])
        # extend() instead of the original `x + comp` prepend: avoids quadratic
        # list concatenation; group statistics are unaffected since the test
        # groups rows by label, not by position.
        comp.extend(vals)
        gru.extend([eth] * len(vals))
    # Long-format frame holding the response and its group labels.
    diff_frame = pd.DataFrame({variable: comp, 'Eth': gru})
    # Rank-transform the response so the HSD is nonparametric.
    tukey = pairwise_tukeyhsd(endog=diff_frame[variable].rank(), # USED RANK FOR NONPARAMETRIC
                              groups=diff_frame['Eth'],
                              alpha=0.05)
    global tukey_sum # Need to make it global so tukey_table() can read it
    tukey_sum = pd.DataFrame(data=tukey._results_table.data[1:], columns=tukey._results_table.data[0])
    tukey_table(variable)
    print(tukey)
# Convert to dataframe for figure
# Rank-based Tukey HSD on admit Rate across ethnicities.
tukey_grab(adm_frame, 'Rate')
Multiple Comparison of Means - Tukey HSD, FWER=0.05
==========================================================================
group1 group2 meandiff p-adj lower upper reject
--------------------------------------------------------------------------
African American American Indian -66.5994 0.0715 -136.4256 3.2267 False
African American Asian 101.6881 0.0001 37.8979 165.4783 True
African American Chicano/Latino 110.7976 0.0 46.8652 174.73 True
African American International 112.0691 0.0 45.3212 178.8169 True
African American White 112.4872 0.0 48.697 176.2774 True
American Indian Asian 168.2875 0.0 98.7243 237.8507 True
American Indian Chicano/Latino 177.397 0.0 107.7034 247.0906 True
American Indian International 178.6685 0.0 106.3834 250.9535 True
American Indian White 179.0866 0.0 109.5234 248.6498 True
Asian Chicano/Latino 9.1095 0.9985 -54.5355 72.7546 False
Asian International 10.381 0.9978 -56.0917 76.8537 False
Asian White 10.7991 0.9966 -52.7031 74.3013 False
Chicano/Latino International 1.2715 1.0 -65.3377 67.8807 False
Chicano/Latino White 1.6896 1.0 -61.9555 65.3346 False
International White 0.4181 1.0 -66.0546 66.8908 False
--------------------------------------------------------------------------
## DIFF IN VALUE
# Same Tukey procedure, on the raw admitted-student counts.
tukey_grab(adm_frame, 'Value')
Multiple Comparison of Means - Tukey HSD, FWER=0.05
============================================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------------------------------
African American American Indian -146.6199 0.0 -204.47 -88.7698 True
African American Asian 133.3953 0.0 80.5459 186.2447 True
African American Chicano/Latino 209.5305 0.0 156.5633 262.4977 True
African American International -0.6296 1.0 -55.9294 54.6702 False
African American White 144.9221 0.0 92.0727 197.7715 True
American Indian Asian 280.0152 0.0 222.383 337.6474 True
American Indian Chicano/Latino 356.1504 0.0 298.4101 413.8907 True
American Indian International 145.9903 0.0 86.103 205.8776 True
American Indian White 291.542 0.0 233.9097 349.1742 True
Asian Chicano/Latino 76.1352 0.0006 23.4061 128.8644 True
Asian International -134.0249 0.0 -189.0967 -78.9531 True
Asian White 11.5268 0.9891 -41.084 64.1376 False
Chicano/Latino International -210.1601 0.0 -265.345 -154.9752 True
Chicano/Latino White -64.6084 0.0065 -117.3376 -11.8793 True
International White 145.5517 0.0 90.4798 200.6235 True
----------------------------------------------------------------------------
## DIFFERENCE IN GPA
# NOTE: rebinds the module-level e_list to only the four ethnicities that
# appear in final_frame (it is restored to all six further down).
e_list = ['Asian', 'Chicano/Latino', 'International', 'White']
tukey_grab(final_frame, 'GPA')
Multiple Comparison of Means - Tukey HSD, FWER=0.05
======================================================================
group1 group2 meandiff p-adj lower upper reject
----------------------------------------------------------------------
Asian Chicano/Latino -34.5754 0.0 -52.1088 -17.0421 True
Asian International 2.7212 0.9979 -36.3571 41.7994 False
Asian White -3.2038 0.9734 -22.6995 16.2918 False
Chicano/Latino International 37.2966 0.0544 -0.4831 75.0762 False
Chicano/Latino White 31.3716 0.0 14.6303 48.1128 True
International White -5.925 0.9783 -44.6544 32.8044 False
----------------------------------------------------------------------
## FOR NORMALITY TEST OF TRANSFORMED FRAME
# If we were to lose a portion of the data, we would get normality in all except for African American
minion_frame = adm_frame[adm_frame.Rate > 0.20]
# Restore the full six-category list (it was shrunk for the GPA comparison above).
e_list = ['African American', 'American Indian', 'Asian', 'Chicano/Latino', 'International', 'White']
for i in e_list:
    print(i, shapiro(minion_frame[minion_frame.Eth == i].Rate),'\n')
African American ShapiroResult(statistic=0.9744502305984497, pvalue=0.06744244694709778) American Indian ShapiroResult(statistic=0.9111583232879639, pvalue=0.03235141932964325) Asian ShapiroResult(statistic=0.943628191947937, pvalue=0.0003231678856536746) Chicano/Latino ShapiroResult(statistic=0.9594532251358032, pvalue=0.0020372187718749046) International ShapiroResult(statistic=0.9388893246650696, pvalue=0.0021637685131281614) White ShapiroResult(statistic=0.9669060707092285, pvalue=0.009036525152623653)
# Kruskal-Wallis on the truncated (Rate > 0.20) frame across all six ethnicities.
trunc_groups = ['African American', 'American Indian', 'Asian',
                'Chicano/Latino', 'International', 'White']
stats.kruskal(*(minion_frame[minion_frame.Eth == g].Rate for g in trunc_groups))
KruskalResult(statistic=70.32613987310312, pvalue=8.765120982874429e-14)
# Per-ethnicity means on the truncated frame.
get_val(minion_frame) # Means look much more similar
Value 15.543478 Rate 0.667405 Binary Rate 0.902174 dtype: float64 Value 4.880000 Rate 0.805571 Binary Rate 1.000000 dtype: float64 Value 74.280000 Rate 0.743809 Binary Rate 0.980000 dtype: float64 Value 76.218182 Rate 0.739485 Binary Rate 1.000000 dtype: float64 Value 45.391304 Rate 0.812597 Binary Rate 0.985507 dtype: float64 Value 69.336449 Rate 0.741762 Binary Rate 0.990654 dtype: float64
# Logistic Regression
from sklearn.model_selection import train_test_split
# 70/30 split, stratified on ethnicity so each class keeps its proportion.
# NOTE(review): no random_state is set, so the split (and everything downstream)
# is not reproducible -- consider fixing a seed.
train, test = train_test_split(adm_frame, stratify = adm_frame['Eth'], train_size = .70)
train.head()
| School | County | City | Type | Eth | Value | Rate | |
|---|---|---|---|---|---|---|---|
| 505 | MONTEREY PENINSULA COLLEGE | Monterey | Monterey | Adm | American Indian | 5 | 0.833333 |
| 181 | COLUMBIA COLLEGE | Tuolumne | Sonora | Adm | White | 17 | 0.708333 |
| 406 | LOS ANGELES HARBOR COLLEGE | Los Angeles | Wilmington | Adm | Asian | 4 | 0.333333 |
| 771 | VICTOR VALLEY COLLEGE | San Bernardino | Victorville | Adm | Chicano/Latino | 61 | 0.717647 |
| 513 | MOORPARK COLLEGE | Ventura | Moorpark | Adm | American Indian | 4 | 0.666667 |
from sklearn.linear_model import LogisticRegression
# Predict ethnicity (6 classes) from the single feature Rate;
# coef_ below has one coefficient (and intercept) per class.
lr = LogisticRegression()
lr.fit(train[['Rate']], train['Eth'])
LogisticRegression()
# Our coefficients
# BUG FIX: the original called test.copy() and discarded the result; assign it
# back so the prediction column is added to an independent copy (avoids the
# SettingWithCopyWarning without changing any values).
test = test.copy()
test['Predicted'] = lr.predict(test[['Rate']])
lr.coef_, lr.intercept_
(array([[-0.33648891],
[-2.28982245],
[ 0.481722 ],
[ 1.43885032],
[-0.26759658],
[ 0.97333563]]),
array([ 0.34549583, 0.96903151, -0.15688824, -0.81784256, 0.1501606 ,
-0.48995715]))
# Stack into a (2, 6) array: row 0 = per-class coefficients, row 1 = per-class intercepts.
coef = np.vstack((lr.coef_.T, lr.intercept_))
coef
array([[-0.33648891, -2.28982245, 0.481722 , 1.43885032, -0.26759658,
0.97333563],
[ 0.34549583, 0.96903151, -0.15688824, -0.81784256, 0.1501606 ,
-0.48995715]])
# Sigmoid function to make scores more legible, and easy to understand
def sigmoid(x):
    """Map a real-valued score (scalar or ndarray) into (0, 1)."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)
# BUG FIX: the original computed test.iloc[:, 5:7] @ coef, i.e.
# Value * coefficients + Rate * intercepts -- not the model's decision
# function (and the huge Value column made np.exp overflow, see the runtime
# warning).  The linear score for a model fit on the single feature Rate is
# Rate * coefficient + 1 * intercept, built here with an explicit ones column.
design = test[['Rate']].assign(ones=1.0)
scores = sigmoid(design @ coef)
scores = scores.set_axis([c+"-score" for c in lr.classes_],axis = 1)
# NOTE(review): lr.predict_proba(test[['Rate']]) would give the properly
# normalized class probabilities; these are per-class logistic scores only.
/Users/fish/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/blocks.py:402: RuntimeWarning: overflow encountered in exp
# Attach the per-class scores alongside the original test rows (aligned on index).
log_frame = pd.concat((test,scores), axis = 1)
log_frame.head(5)
| School | County | City | Type | Eth | Value | Rate | Predicted | African American-score | American Indian-score | Asian-score | Chicano/Latino-score | International-score | White-score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 137 | COLLEGE OF SAN MATEO | San Mateo | San Mateo | Adm | International | 117 | 0.866667 | Chicano/Latino | 1.076994e-17 | 1.030940e-116 | 1.000000 | 1.000000 | 2.879251e-14 | 1.000000 |
| 8 | AMERICAN RIVER COLLEGE | Sacramento | Sacramento | Adm | International | 9 | 0.900000 | Chicano/Latino | 6.195126e-02 | 2.683152e-09 | 0.985143 | 0.999995 | 9.336489e-02 | 0.999756 |
| 668 | SAN JOAQUIN DELTA COLLEGE | San Joaquin | Stockton | Adm | African American | 11 | 0.687500 | White | 3.035851e-02 | 2.240274e-11 | 0.994465 | 1.000000 | 5.518349e-02 | 0.999969 |
| 366 | LAKE TAHOE COMMUNITY COLLEGE | El Dorado | South Lake Tahoe | Adm | Asian | 7 | 0.636364 | Asian | 1.056882e-01 | 2.025861e-07 | 0.963463 | 0.999929 | 1.445973e-01 | 0.998501 |
| 336 | GROSSMONT CMTY COLLEGE | San Diego | El Cajon | Adm | Asian | 35 | 0.729167 | Chicano/Latino | 9.878028e-06 | 3.168562e-35 | 1.000000 | 1.000000 | 9.549050e-05 | 1.000000 |
# The probability that the predicted ethnicity was equal to the actual.
# (The original also displayed len(...) of the boolean Series, which is just
# the row count -- removed as a no-op.)  Mean of a boolean Series == accuracy.
(log_frame['Eth'] == log_frame['Predicted']).mean()
0.3225806451612903
# The counts of the ethnicities correctly predicted
# (per the output below, only Chicano/Latino and American Indian are ever predicted correctly).
log_frame[log_frame['Eth'] == log_frame['Predicted']].Eth.value_counts()
Chicano/Latino 33 American Indian 12 Name: Eth, dtype: int64
import statsmodels.api as sm
import statsmodels.formula.api as smf
## African American as the reference variable
# Summary table for significant variables
# NOTE(review): Rate is a continuous proportion in [0, 1], not 0/1 -- smf.logit
# treats it as a fractional response here; confirm that is intended.
formula = "Rate ~ C(Eth)"
log_reg = smf.logit(formula, data=adm_frame).fit()
print(log_reg.params)
Optimization terminated successfully.
Current function value: 0.586023
Iterations 5
Logit Regression Results
==============================================================================
Dep. Variable: Rate No. Observations: 619
Model: Logit Df Residuals: 613
Method: MLE Df Model: 5
Date: Mon, 05 Dec 2022 Pseudo R-squ.: 0.08705
Time: 23:39:24 Log-Likelihood: -362.75
converged: True LL-Null: -397.33
Covariance Type: nonrobust LLR p-value: 1.523e-13
=============================================================================================
coef std err z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------
Intercept 0.2338 0.192 1.218 0.223 -0.142 0.610
C(Eth)[T.American Indian] -1.3232 0.321 -4.118 0.000 -1.953 -0.693
C(Eth)[T.Asian] 0.4479 0.277 1.615 0.106 -0.096 0.991
C(Eth)[T.Chicano/Latino] 0.7752 0.288 2.693 0.007 0.211 1.339
C(Eth)[T.International] 0.1570 0.285 0.551 0.581 -0.401 0.715
C(Eth)[T.White] 0.6550 0.283 2.314 0.021 0.100 1.210
=============================================================================================
Intercept 0.233832
C(Eth)[T.American Indian] -1.323180
C(Eth)[T.Asian] 0.447854
C(Eth)[T.Chicano/Latino] 0.775162
C(Eth)[T.International] 0.156990
C(Eth)[T.White] 0.654991
dtype: float64
import numpy as np
# Our odds ratios: exponentiate the logit coefficients and their 95% CI bounds.
odds_ratios = pd.DataFrame(
    {
        "OR": log_reg.params,
        "Lower CI": log_reg.conf_int()[0],
        "Upper CI": log_reg.conf_int()[1],
    }
)
odds_ratios = np.exp(odds_ratios)
print(odds_ratios)
OR Lower CI Upper CI Intercept 1.263432 0.867208 1.840688 C(Eth)[T.American Indian] 0.266287 0.141865 0.499835 C(Eth)[T.Asian] 1.564951 0.908806 2.694822 C(Eth)[T.Chicano/Latino] 2.170943 1.234819 3.816749 C(Eth)[T.International] 1.169984 0.669624 2.044225 C(Eth)[T.White] 1.925126 1.105455 3.352565
# DECISION TREE
# Must encode as eth is categorical
# Binary target: 1 if the admit rate is at least 50%, else 0.
adm_frame['Binary Rate'] = np.where(adm_frame['Rate'] >= 0.5, 1, 0)
adm_frame.head(5)
ohe = pd.get_dummies(data=adm_frame, columns=['Eth']) # Need to get dummy variables
ohe_frame = ohe
ohe_frame.head(3) # We can see that now eth is encoded
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
from matplotlib import pyplot
## ETH AS X, RATE AS Y
X = ohe_frame.iloc[:, 7:14]
y = ohe_frame['Binary Rate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# TRYING DECISION TREE REGRESSOR ON IT
clf = DecisionTreeRegressor()
# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)
model = DecisionTreeRegressor()
# fit the model
model.fit(X, y)
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
plt.xlabel('Features')
plt.ylabel('Score')
pyplot.title("Feature Importance")
pyplot.show()
# From here, we get the same conclusions as before. Chicano/Latino and American Indian were most important.
Feature: 0, Score: 0.00183 Feature: 1, Score: 0.78606 Feature: 2, Score: 0.04898 Feature: 3, Score: 0.08575 Feature: 4, Score: 0.00000 Feature: 5, Score: 0.07739
# Visualize the fitted decision tree; returns the list of per-node text
# annotations (split rule, squared error, sample count, mean value).
from sklearn import tree
tree.plot_tree(clf)
# Tree plot
[Text(0.75, 0.9166666666666666, 'X[1] <= 0.5\nsquared_error = 0.097\nsamples = 433\nvalue = 0.597'), Text(0.625, 0.75, 'X[3] <= 0.5\nsquared_error = 0.067\nsamples = 372\nvalue = 0.651'), Text(0.5, 0.5833333333333334, 'X[5] <= 0.5\nsquared_error = 0.081\nsamples = 291\nvalue = 0.625'), Text(0.375, 0.4166666666666667, 'X[2] <= 0.5\nsquared_error = 0.093\nsamples = 215\nvalue = 0.601'), Text(0.25, 0.25, 'X[4] <= 0.5\nsquared_error = 0.107\nsamples = 133\nvalue = 0.574'), Text(0.125, 0.08333333333333333, 'squared_error = 0.071\nsamples = 69\nvalue = 0.553'), Text(0.375, 0.08333333333333333, 'squared_error = 0.145\nsamples = 64\nvalue = 0.597'), Text(0.5, 0.25, 'squared_error = 0.068\nsamples = 82\nvalue = 0.645'), Text(0.625, 0.4166666666666667, 'squared_error = 0.041\nsamples = 76\nvalue = 0.694'), Text(0.75, 0.5833333333333334, 'squared_error = 0.006\nsamples = 81\nvalue = 0.745'), Text(0.875, 0.75, 'squared_error = 0.149\nsamples = 61\nvalue = 0.266')]
# KNN PREDICTION: predict the ethnicity indicator columns from the rate.
# Import here — in the original script KNeighborsClassifier is only imported
# further down, so this cell could not run standalone.
from sklearn.neighbors import KNeighborsClassifier
feature_cols = ['Rate']#, 'Value'
X = ohe_frame[feature_cols]
y = ohe_frame.iloc[:, 7:14]  # Eth_* dummies as a multilabel target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test)) # Accuracy score
0.1827956989247312
y_pred = knn.predict(X_test)
# Import here — the original only imports classification_report further down.
from sklearn.metrics import classification_report
cr = classification_report(y_test, y_pred)
print(cr) # American Indian and Chicano/Latino had the highest F1 scores.
# F1 is the harmonic mean of precision and recall, the aforementioned two had higher accuracy.
precision recall f1-score support
0 0.50 0.22 0.31 41
1 0.44 0.74 0.55 19
2 0.50 0.03 0.06 30
3 0.45 0.17 0.24 30
4 0.75 0.10 0.18 30
5 1.00 0.06 0.11 36
micro avg 0.49 0.18 0.27 186
macro avg 0.61 0.22 0.24 186
weighted avg 0.62 0.18 0.22 186
samples avg 0.18 0.18 0.18 186
/Users/fish/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
f_list
['African American', 'American Indian', 'Asian', 'Chicano/Latino', 'International', 'White', 'micro avg', 'macro avg', 'weighted avg', 'samples avg']
# Convert the sklearn classification report into a DataFrame so plotly can
# later render it as a table.
report = classification_report(y_test, y_pred, output_dict = True)
extra = ['micro avg', 'macro avg', 'weighted avg', 'samples avg']
f_list = e_list + extra
knncr = pd.DataFrame(report).T
# Prepend a label column naming each row's ethnicity / averaging scheme.
knncr.insert(0, "Eth and Avg", f_list)
/Users/fish/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.
# Render the KNN classification report as a styled plotly table.
table_header = dict(values=list(knncr.columns),
                    fill_color='lavenderblush',
                    align='left')
table_cells = dict(values=[knncr['Eth and Avg'], knncr.precision, knncr.recall,
                           knncr['f1-score'], knncr.support],
                   fill_color='lavender',
                   align='left')
fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
fig.update_layout(
    title_text = "KNN Classification Report",
    title_font_size=30,
    title_x=0.5,
    font_family="Times New Roman",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
)
fig.update_traces(cells_font=dict(size = 10))
fig.show()
# Flip the roles: ethnicity dummies become the features and the binarized
# admission rate becomes the target.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
X = ohe_frame.iloc[:, 7:14]
y = ohe_frame['Binary Rate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# Fit a 7-nearest-neighbor classifier and report held-out accuracy.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
print(knn.score(X_test, y_test))
0.8602150537634409
from sklearn.neighbors import KNeighborsClassifier
# `plt` is used below but never imported under that name anywhere in view —
# bind it explicitly so this cell cannot NameError.
from matplotlib import pyplot as plt
# Setup arrays to store training and test accuracies for k = 1..6
neighbors = np.arange(1, 7)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
for i, k in enumerate(neighbors):
    # Setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit the model
    knn.fit(X_train, y_train)
    # Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)
    # Compute accuracy on the test set
    test_accuracy[i] = knn.score(X_test, y_test)
# NOTE(review): after this loop `knn` holds the k=6 model, so later cells that
# reuse `knn` no longer refer to the n_neighbors=7 fit — confirm that is intended.
plt.title('k-NN Varying number of neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training accuracy')
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.show()
# Accurate between the two
# Confusion matrix for the current KNN fit on the held-out split.
from sklearn.metrics import confusion_matrix
y_pred = knn.predict(X_test)
# Rows are true class, columns predicted. Per the printed matrix below:
# True Negative 22, False Positive 13, False Negative 27, True Positive 124
# (the original comment's "True Positive 60 / False Negative 124" did not
# match the printed output).
confusion_matrix(y_test,y_pred)
array([[ 22, 13],
[ 27, 124]])
# Per-class precision/recall/F1 for the flipped model — noticeably higher
# F1 scores than the first KNN setup.
from sklearn.metrics import classification_report
report_text = classification_report(y_test, y_pred)
print(report_text)
precision recall f1-score support
0 0.45 0.63 0.52 35
1 0.91 0.82 0.86 151
accuracy 0.78 186
macro avg 0.68 0.72 0.69 186
weighted avg 0.82 0.78 0.80 186
# ROC curve for the KNN classifier; a curve hugging the upper-left corner
# indicates high discriminative accuracy.
from sklearn.metrics import roc_curve
# `plt` is never imported under that name anywhere in view — bind it here.
from matplotlib import pyplot as plt
# Probability of the positive class (column 1 of predict_proba).
y_pred_proba = knn.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot([0,1],[0,1],'k--')  # chance diagonal for reference
plt.plot(fpr,tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
# NOTE(review): if the preceding k-sweep loop ran, `knn` is the k=6 model and
# this title overstates n_neighbors — confirm which fit is intended.
plt.title('Knn(n_neighbors=7) ROC curve')
plt.show()
# Summary table for which vars are most significant.
# Both names below are used bare but never imported in view (the header only
# has `import sklearn.linear_model`, which does not bind `linear_model`).
from sklearn import linear_model
import statsmodels.api as sm
mlr = pd.get_dummies(data=final_frame, columns=['Eth'])
x = mlr[['GPA', 'Value', 'Eth_Asian','Eth_Chicano/Latino', 'Eth_International', 'Eth_White']]
y = mlr['Rate']
# with sklearn: point estimates only
regr = linear_model.LinearRegression()
regr.fit(x, y)
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)
# with statsmodels: full summary (std errors, t-stats, p-values)
x = sm.add_constant(x) # adding a constant
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
print_model = model.summary()
print(print_model)
# NOTE(review): the summary reports Cond. No. ~1.7e18 (near-singular design);
# the dummy columns plus the constant are likely collinear — consider
# pd.get_dummies(..., drop_first=True).
Intercept:
0.028055734103871077
Coefficients:
[ 2.11277822e-01 5.22287539e-05 -1.50052021e-02 -4.66654219e-02
8.76753014e-02 -2.60046773e-02]
OLS Regression Results
==============================================================================
Dep. Variable: Rate R-squared: 0.307
Model: OLS Adj. R-squared: 0.274
Method: Least Squares F-statistic: 9.296
Date: Sat, 03 Dec 2022 Prob (F-statistic): 2.36e-07
Time: 12:54:37 Log-Likelihood: 160.78
No. Observations: 111 AIC: -309.6
Df Residuals: 105 BIC: -293.3
Df Model: 5
Covariance Type: nonrobust
======================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------
const 0.0224 0.220 0.102 0.919 -0.413 0.458
GPA 0.2113 0.078 2.719 0.008 0.057 0.365
Value 5.223e-05 5.16e-05 1.011 0.314 -5.02e-05 0.000
Eth_Asian -0.0094 0.058 -0.163 0.871 -0.124 0.105
Eth_Chicano/Latino -0.0411 0.051 -0.808 0.421 -0.142 0.060
Eth_International 0.0933 0.062 1.509 0.134 -0.029 0.216
Eth_White -0.0204 0.057 -0.357 0.722 -0.134 0.093
==============================================================================
Omnibus: 24.858 Durbin-Watson: 2.265
Prob(Omnibus): 0.000 Jarque-Bera (JB): 84.237
Skew: 0.672 Prob(JB): 5.11e-19
Kurtosis: 7.051 Cond. No. 1.66e+18
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.04e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.